Machine Learning 1 in Python with scikit-learn (sklearn)

Python
Author

Tony Duan

Published

October 15, 2023

1 package

Code
#!python3 -m pip install ydata-profiling
Code
#!python3 -m pip install pydantic-settings
Code
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

from siuba import _, mutate, filter, group_by, summarize, show_query
# the star import additionally provides if_else() and count(), used below
from siuba import *

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
# FIX: the package installed above is ydata-profiling, the renamed successor
# of pandas-profiling; "from pandas_profiling import ProfileReport" no longer
# matches the installed distribution.
from ydata_profiling import ProfileReport
#%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from pandas import get_dummies

2 data

Code
# List the files available in the local ./data directory.
import os
os.listdir('data') 
['test.csv', 'train.csv', 'gender_submission.csv']

from https://www.kaggle.com/c/titanic/data

Code
# Load both Kaggle Titanic splits and keep them in one list so later
# cleaning steps can iterate over the two frames together.
combine = [pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')]
train_df, test_df = combine

3 EDA

Code
print(train_df.columns.values)
['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
Code
# preview the first five rows of the training data
train_df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
Code
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Code
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB
Code
train_df.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
Code
profile_report=ProfileReport(train_df, title=f"Pandas Profiling Report for Titanic Dataset")
Code
profile_report.to_file("profile.html")
Code
profile_report.to_notebook_iframe()

4 data clean

4.1 target variable:

Code
y = train_df["Survived"]

4.2 feature engineering on model variable:

4.2.1 one hot encode with get_dummies()

Code
# Columns to feed into get_dummies().
# NOTE(review): get_dummies() only one-hot encodes object/category columns,
# so of these only Sex is actually encoded (-> Sex_female / Sex_male);
# Pclass, SibSp and Parch are int64 and pass through unchanged, as the
# x_train.head() output later in this document shows.
one_hot_features = ["Pclass", "Sex", "SibSp", "Parch"]

train_df_encoded=get_dummies(train_df[one_hot_features])

test_df_encoded=get_dummies(test_df[one_hot_features])

4.2.2 handling missing

Code
train_df.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
Code
# Numeric columns with missing values (Age in both splits, Fare in test).
missing_features = ["Age", "Fare"]

# FIX: impute BOTH splits with the training-set means. The original filled
# the test set with its own means, which leaks test-set statistics and makes
# the two splits inconsistent with each other.
train_means = train_df[missing_features].mean()

train_df_missing = train_df[missing_features].fillna(train_means)

test_df_missing = test_df[missing_features].fillna(train_means)

4.2.3 combine

Build the model feature matrix `x` from the training data:

Code
# appending multiple DataFrame
x = pd.concat([train_df_encoded, train_df_missing], axis=1, join='inner')

Build the submission feature matrix `final_x` from the test data:

Code
# appending multiple DataFrame
final_x = pd.concat([test_df_encoded, test_df_missing], axis=1, join='inner')

4.3 split

Code
# 90/10 split of the training data; fixed random_state for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.9, random_state = 123)
Code
# 801 rows x 7 feature columns
x_train.shape
(801, 7)
Code
# 90 held-out rows
x_test.shape
(90, 7)
Code
# peek at the training features after encoding/imputation
x_train.head()
Pclass SibSp Parch Sex_female Sex_male Age Fare
677 3 0 0 True False 18.000000 9.8417
547 2 0 0 False True 29.699118 13.8625
317 2 0 0 False True 54.000000 14.0000
261 3 4 2 False True 3.000000 31.3875
273 1 0 1 False True 37.000000 29.7000

5 model

5.1 SVM (Support Vector Machine)

Code
# Support Vector Machines
# Fit an SVC with default hyperparameters.
svc = SVC()
svc.fit(x_train, y_train)

# accuracy on the training data (percent, 2 dp)
acc_svc_train = round(svc.score(x_train, y_train) * 100, 2)
print(acc_svc_train)
67.79
Code
# accuracy on the held-out testing data
acc_svc_test = round(svc.score(x_test, y_test) * 100, 2)
print(acc_svc_test)

# prediction on the Kaggle test set
svc_pred = svc.predict(final_x)
75.56

5.2 KNN

Code
# k-nearest neighbours with k = 3
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(x_train, y_train)

# accuracy on the training data (percent, 2 dp)
acc_knn_train = round(knn.score(x_train, y_train) * 100, 2)
print(acc_knn_train)
83.65
Code
# accuracy on the held-out testing data
acc_knn_test=round(knn.score(x_test, y_test) * 100, 2)
print(acc_knn_test)

# prediction on the Kaggle test set
knn_pred = knn.predict(final_x)
76.67

5.3 Random Forest

Code
# Random forest with 100 trees
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(x_train, y_train)

# accuracy on the training data (near-perfect 98.25 vs 82.22 on test:
# the forest overfits the training split)
acc_random_forest_train = round(random_forest.score(x_train, y_train) * 100, 2)
print(acc_random_forest_train)
98.25
Code
# accuracy on the held-out testing data
acc_random_forest_test = round(random_forest.score(x_test, y_test) * 100, 2)
print(acc_random_forest_test)

# prediction on the Kaggle test set; used for the submission file later
Y_pred = random_forest.predict(final_x)
82.22

6 Benchmark

Code
y_train.head()
677    1
547    1
317    0
261    1
273    0
Name: Survived, dtype: int64
Code
# Convert the label Series to a one-column DataFrame and take rows 10-14.
y_train_data=(y_train.to_frame())
y_train_data=y_train_data[10:15]
Code
y_train_data
Survived
164 0
659 0
536 0
875 1
196 0
Code
# drop the original passenger index so rows are renumbered 0..4
y_train_data=y_train_data.reset_index(drop=True)

y_train_data
Survived
0 0
1 0
2 0
3 1
4 0
Code
# siuba-style mutate: add a column that is 1 when Survived > 0, else 0
y_train_data2=y_train_data>>mutate(correct=if_else(_.Survived>0,1,0))
y_train_data2
Survived correct
0 0 0
1 0 0
2 0 0
3 1 1
4 0 0
Survived correct
0 0 0
1 0 0
2 0 0
3 1 1
4 0 0
Code
# class balance of the training labels: 487 did not survive (0) vs 314 survived (1)
from collections import Counter
Counter(y_train)
Counter({0: 487, 1: 314})

Baseline: guess 0 (did not survive) for every passenger.

Code
# One way: create 'compare' (1 when the all-zeros guess is right) with siuba's mutate/if_else
base_data=(y_train.to_frame()).reset_index(drop=True)>>mutate(pred=0)>>mutate(compare=if_else(_.Survived==_.pred,1,0))

# Alternative: create 'compare' using np.where()
# (this second result overwrites the base_data built above)
base_data=y_train.to_frame()>>mutate(pred=0)
base_data['compare'] = np.where(base_data['Survived']!= base_data['pred'], 0, 1)

The dummy (always-predict-0) accuracy is about 60.8%:

Code
sum(base_data['compare'])/count(base_data)
n
0 0.60799

6.1 Final prediction

using Random forest:

Code
# Build the Kaggle submission frame: PassengerId from the raw test set plus
# the random-forest predictions.
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
    
submission.head()
PassengerId Survived
0 892 0
1 893 0
2 894 1
3 895 1
4 896 0
Code
submission.to_csv('submission.csv',index=False)

7 Reference

https://www.kaggle.com/c/titanic/data

https://kevinwang09.github.io/compare-r-python/workflows.html

https://www.kaggle.com/code/startupsci/titanic-data-science-solutions